{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import gzip\n",
    "import pandas as pd\n",
    "from urllib.request import urlopen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse(path):\n",
    "    \"\"\"Yield one decoded JSON object per line of a gzip-compressed file.\n",
    "\n",
    "    Args:\n",
    "        path: Location of a gzip file holding one JSON document per line.\n",
    "\n",
    "    Yields:\n",
    "        The value returned by json.loads for each line, in file order.\n",
    "\n",
    "    Raises:\n",
    "        json.JSONDecodeError: If a line is not valid JSON.\n",
    "    \"\"\"\n",
    "    # The with-block closes the handle once the generator is exhausted or\n",
    "    # closed, instead of leaving it open for the rest of the kernel session.\n",
    "    with gzip.open(path, 'rb') as handle:\n",
    "        for line in handle:\n",
    "            yield json.loads(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import gzip\n",
    "import json\n",
    "\n",
    "def parse(path):\n",
    "    \"\"\"Yield one decoded JSON object per line of a gzip-compressed file.\n",
    "\n",
    "    Args:\n",
    "        path: Location of a gzip file holding one JSON document per line.\n",
    "\n",
    "    Yields:\n",
    "        The value returned by json.loads for each line, in file order.\n",
    "\n",
    "    Raises:\n",
    "        json.JSONDecodeError: If a line is not valid JSON.\n",
    "    \"\"\"\n",
    "    # Opening inside a with-block guarantees the gzip handle is closed when\n",
    "    # the generator finishes or is closed, rather than being left open.\n",
    "    with gzip.open(path, 'rb') as handle:\n",
    "        for line in handle:\n",
    "            yield json.loads(line)\n",
    "\n",
    "def getDF(path):\n",
    "    \"\"\"Load a gzip-compressed JSON-lines file into a DataFrame.\n",
    "\n",
    "    Each object produced by parse(path) becomes one row, labelled with its\n",
    "    0-based position in the file.\n",
    "    \"\"\"\n",
    "    rows_by_position = dict(enumerate(parse(path)))\n",
    "    return pd.DataFrame.from_dict(rows_by_position, orient='index')\n",
    "\n",
    "# Read every record of the Books_5 review file into `df` (one row per JSON line).\n",
    "df = getDF('/data/zyang/datasets/amazon_book/Books_5.json.gz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>overall</th>\n",
       "      <th>verified</th>\n",
       "      <th>reviewTime</th>\n",
       "      <th>reviewerID</th>\n",
       "      <th>asin</th>\n",
       "      <th>style</th>\n",
       "      <th>reviewerName</th>\n",
       "      <th>reviewText</th>\n",
       "      <th>summary</th>\n",
       "      <th>unixReviewTime</th>\n",
       "      <th>vote</th>\n",
       "      <th>image</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.0</td>\n",
       "      <td>False</td>\n",
       "      <td>03 30, 2005</td>\n",
       "      <td>A1REUF3A1YCPHM</td>\n",
       "      <td>0001713353</td>\n",
       "      <td>{'Format:': ' Hardcover'}</td>\n",
       "      <td>TW Ervin II</td>\n",
       "      <td>The King, the Mice and the Cheese by Nancy Gur...</td>\n",
       "      <td>A story children will love and learn from</td>\n",
       "      <td>1112140800</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.0</td>\n",
       "      <td>True</td>\n",
       "      <td>06 20, 2016</td>\n",
       "      <td>AVP0HXC9FG790</td>\n",
       "      <td>0001713353</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Amazon Customer</td>\n",
       "      <td>The kids loved it!</td>\n",
       "      <td>Five Stars</td>\n",
       "      <td>1466380800</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   overall  verified   reviewTime      reviewerID        asin  \\\n",
       "0      5.0     False  03 30, 2005  A1REUF3A1YCPHM  0001713353   \n",
       "1      5.0      True  06 20, 2016   AVP0HXC9FG790  0001713353   \n",
       "\n",
       "                       style     reviewerName  \\\n",
       "0  {'Format:': ' Hardcover'}      TW Ervin II   \n",
       "1                        NaN  Amazon Customer   \n",
       "\n",
       "                                          reviewText  \\\n",
       "0  The King, the Mice and the Cheese by Nancy Gur...   \n",
       "1                                 The kids loved it!   \n",
       "\n",
       "                                     summary  unixReviewTime vote image  \n",
       "0  A story children will love and learn from      1112140800  NaN   NaN  \n",
       "1                                 Five Stars      1466380800  NaN   NaN  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first two review rows and the columns they carry.\n",
    "df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2934949\n",
      "{'category': [], 'tech1': '', 'description': ['It is a biology book with God&apos;s perspective.'], 'fit': '', 'title': 'Biology Gods Living Creation Third Edition 10 (A Beka Book Science Series)', 'also_buy': ['0669009075', 'B000K2P5SA', 'B00MD4G2N0', 'B000ASIPTK', '0130508470', '1892427524', '0321567919', 'B000BJBH20', '0547484631', 'B000HAJTQO', 'B000AUCX7I', '0130365645', 'B000BI1Y2O', '0395976715', '052817729X', '1579246443', 'B001CK63XK', '1591669847', '0395879884', '836585161X', 'B01J2F9BH6', 'B00KYEHR4E', '158008141X', '1857928393', '0927545829', 'B015AR0RA0', 'B000TVHHRE', '0865167990', '1579246052', 'B003NXXVD4', 'B000OH6AX0', '061802087X', 'B000NU2X02', '0743252012'], 'tech2': '', 'brand': 'Keith Graham', 'feature': [], 'rank': '1,349,781 in Books (', 'also_view': ['0019777701', 'B000AUCX7I', 'B000K2P5SA', 'B001CK63XK', 'B01J2F9BH6', 'B000BI1Y2O', '1932012540', 'B0095ZCRCK'], 'main_cat': 'Books', 'similar_item': '', 'date': '', 'price': '$39.94', 'asin': '0000092878', 'imageURL': [], 'imageURLHighRes': []}\n",
      "2934949\n"
     ]
    }
   ],
   "source": [
    "# Decode the product metadata file: one JSON object per (whitespace-stripped) line.\n",
    "with gzip.open('/data/zyang/datasets/amazon_book/meta_Books.json.gz') as f:\n",
    "    metadata = [json.loads(line.strip()) for line in f]\n",
    "\n",
    "# Number of decoded records, followed by the first record as a sample.\n",
    "print(len(metadata))\n",
    "print(metadata[0])\n",
    "\n",
    "# One DataFrame row per metadata record; the row count is printed as a check.\n",
    "df_meta = pd.DataFrame(metadata)\n",
    "print(len(df_meta))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "2934949\n"
     ]
    }
   ],
   "source": [
    "# Replace missing values with '' so the string match below sees text everywhere.\n",
    "df3 = df_meta.fillna('')\n",
    "# Evaluate the 'getTime' title match once and reuse it for both subsets,\n",
    "# rather than scanning the title column twice.\n",
    "has_get_time = df3.title.str.contains('getTime')\n",
    "df4 = df3[has_get_time]   # unformatted rows\n",
    "df5 = df3[~has_get_time]  # remaining rows, with the unformatted ones filtered out\n",
    "print(len(df4))\n",
    "print(len(df5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    2.716498e+07\n",
       "mean     4.374226e+00\n",
       "std      9.986682e-01\n",
       "min      0.000000e+00\n",
       "25%      4.000000e+00\n",
       "50%      5.000000e+00\n",
       "75%      5.000000e+00\n",
       "max      5.000000e+00\n",
       "Name: overall, dtype: float64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics of the `overall` column, with missing values removed first.\n",
    "df['overall'].dropna().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',\n",
       "       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',\n",
       "       'image'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the columns available in the review frame.\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>overall</th>\n",
       "      <th>reviewTime</th>\n",
       "      <th>reviewerID</th>\n",
       "      <th>asin</th>\n",
       "      <th>vote</th>\n",
       "      <th>unixReviewTime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.0</td>\n",
       "      <td>03 30, 2005</td>\n",
       "      <td>A1REUF3A1YCPHM</td>\n",
       "      <td>0001713353</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1112140800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.0</td>\n",
       "      <td>06 20, 2016</td>\n",
       "      <td>AVP0HXC9FG790</td>\n",
       "      <td>0001713353</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1466380800</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   overall   reviewTime      reviewerID        asin vote  unixReviewTime\n",
       "0      5.0  03 30, 2005  A1REUF3A1YCPHM  0001713353  NaN      1112140800\n",
       "1      5.0  06 20, 2016   AVP0HXC9FG790  0001713353  NaN      1466380800"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Narrow the review frame to six rating-related columns, then preview the result.\n",
    "rating_cols = ['overall', 'reviewTime', 'reviewerID', 'asin', 'vote', 'unixReviewTime']\n",
    "rating = df.loc[:, rating_cols]\n",
    "rating.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select and rename directly from `df` in one expression, so this cell gives\n",
    "# the same four columns (asin, user, rating, timestamp) however often it runs.\n",
    "rating = df[['asin', 'reviewerID', 'overall', 'unixReviewTime']].rename(\n",
    "    columns={'reviewerID': 'user', 'overall': 'rating', 'unixReviewTime': 'timestamp'}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',\n",
       "       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',\n",
       "       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the columns of the filtered metadata frame.\n",
    "df5.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>asin</th>\n",
       "      <th>category</th>\n",
       "      <th>title</th>\n",
       "      <th>brand</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000092878</td>\n",
       "      <td>[]</td>\n",
       "      <td>Biology Gods Living Creation Third Edition 10 ...</td>\n",
       "      <td>Keith Graham</td>\n",
       "      <td>$39.94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000047715X</td>\n",
       "      <td>[Books, New, Used &amp; Rental Textbooks, Medicine...</td>\n",
       "      <td>Mksap 16 Audio Companion: Medical Knowledge Se...</td>\n",
       "      <td>Acp</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         asin                                           category  \\\n",
       "0  0000092878                                                 []   \n",
       "1  000047715X  [Books, New, Used & Rental Textbooks, Medicine...   \n",
       "\n",
       "                                               title         brand   price  \n",
       "0  Biology Gods Living Creation Third Edition 10 ...  Keith Graham  $39.94  \n",
       "1  Mksap 16 Audio Companion: Medical Knowledge Se...           Acp          "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep five product attributes from the filtered metadata, then preview them.\n",
    "meta_cols = ['asin', 'category', 'title', 'brand', 'price']\n",
    "meta_data = df5.loc[:, meta_cols]\n",
    "meta_data.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2934949, 5)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns) of the selected metadata subset.\n",
    "meta_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2930024, 5)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep a single row per asin; keep='last' retains the final occurrence of each.\n",
    "meta_data = meta_data.drop_duplicates(subset=['asin'],keep='last')\n",
    "meta_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>asin</th>\n",
       "      <th>category</th>\n",
       "      <th>title</th>\n",
       "      <th>brand</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000092878</td>\n",
       "      <td>[]</td>\n",
       "      <td>Biology Gods Living Creation Third Edition 10 ...</td>\n",
       "      <td>Keith Graham</td>\n",
       "      <td>$39.94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000047715X</td>\n",
       "      <td>[Books, New, Used &amp; Rental Textbooks, Medicine...</td>\n",
       "      <td>Mksap 16 Audio Companion: Medical Knowledge Se...</td>\n",
       "      <td>Acp</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         asin                                           category  \\\n",
       "0  0000092878                                                 []   \n",
       "1  000047715X  [Books, New, Used & Rental Textbooks, Medicine...   \n",
       "\n",
       "                                               title         brand   price  \n",
       "0  Biology Gods Living Creation Third Edition 10 ...  Keith Graham  $39.94  \n",
       "1  Mksap 16 Audio Companion: Medical Knowledge Se...           Acp          "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the metadata rows after de-duplicating on asin.\n",
    "meta_data.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# how='right' keeps every metadata row, so products without a matching review\n",
    "# appear once with NaN in the rating-side columns (user, rating, timestamp).\n",
    "data = rating.merge(meta_data,on='asin',how='right')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>asin</th>\n",
       "      <th>user</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>category</th>\n",
       "      <th>title</th>\n",
       "      <th>brand</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000092878</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[]</td>\n",
       "      <td>Biology Gods Living Creation Third Edition 10 ...</td>\n",
       "      <td>Keith Graham</td>\n",
       "      <td>$39.94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000047715X</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[Books, New, Used &amp; Rental Textbooks, Medicine...</td>\n",
       "      <td>Mksap 16 Audio Companion: Medical Knowledge Se...</td>\n",
       "      <td>Acp</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         asin user  rating  timestamp  \\\n",
       "0  0000092878  NaN     NaN        NaN   \n",
       "1  000047715X  NaN     NaN        NaN   \n",
       "\n",
       "                                            category  \\\n",
       "0                                                 []   \n",
       "1  [Books, New, Used & Rental Textbooks, Medicine...   \n",
       "\n",
       "                                               title         brand   price  \n",
       "0  Biology Gods Living Creation Third Edition 10 ...  Keith Graham  $39.94  \n",
       "1  Mksap 16 Audio Companion: Medical Knowledge Se...           Acp          "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the merged frame; rating-side columns are NaN where no review matched.\n",
    "data.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(29389818, 8)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns) of the merged frame.\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop every row containing a NaN, which removes the products that the right\n",
    "# join kept without any matching review.\n",
    "data = data.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27163817, 8)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns) of `data` as it currently stands.\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>asin</th>\n",
       "      <th>user</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>category</th>\n",
       "      <th>title</th>\n",
       "      <th>brand</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>AYEDW3BFK53XK</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.325462e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>A2SUAM1J3GNN3B</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.252800e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>A1NGDVXI2BAG7C</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.407715e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>AEI062TCE8IGJ</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.452902e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>A2GDDTUYPEUIF7</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.413936e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         asin            user  rating     timestamp  \\\n",
       "3  0000013765   AYEDW3BFK53XK     5.0  1.325462e+09   \n",
       "4  0000013765  A2SUAM1J3GNN3B     5.0  1.252800e+09   \n",
       "5  0000013765  A1NGDVXI2BAG7C     5.0  1.407715e+09   \n",
       "6  0000013765   AEI062TCE8IGJ     5.0  1.452902e+09   \n",
       "7  0000013765  A2GDDTUYPEUIF7     5.0  1.413936e+09   \n",
       "\n",
       "                             category  \\\n",
       "3  [Books, Arts & Photography, Music]   \n",
       "4  [Books, Arts & Photography, Music]   \n",
       "5  [Books, Arts & Photography, Music]   \n",
       "6  [Books, Arts & Photography, Music]   \n",
       "7  [Books, Arts & Photography, Music]   \n",
       "\n",
       "                                        title          brand price  \n",
       "3  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        \n",
       "4  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        \n",
       "5  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        \n",
       "6  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        \n",
       "7  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of `data`.\n",
    "data.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>asin</th>\n",
       "      <th>user</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>category</th>\n",
       "      <th>title</th>\n",
       "      <th>brand</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>AYEDW3BFK53XK</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.325462e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>A2SUAM1J3GNN3B</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.252800e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         asin            user  rating     timestamp  \\\n",
       "3  0000013765   AYEDW3BFK53XK     5.0  1.325462e+09   \n",
       "4  0000013765  A2SUAM1J3GNN3B     5.0  1.252800e+09   \n",
       "\n",
       "                             category  \\\n",
       "3  [Books, Arts & Photography, Music]   \n",
       "4  [Books, Arts & Photography, Music]   \n",
       "\n",
       "                                        title          brand price  \n",
       "3  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        \n",
       "4  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Work on an independent copy so later changes to rating_ leave `data` untouched.\n",
    "rating_ = data.copy()\n",
    "rating_.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>iid</th>\n",
       "      <th>uid</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>category</th>\n",
       "      <th>title</th>\n",
       "      <th>brand</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>AYEDW3BFK53XK</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.325462e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>A2SUAM1J3GNN3B</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.252800e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          iid             uid  rating     timestamp  \\\n",
       "3  0000013765   AYEDW3BFK53XK     5.0  1.325462e+09   \n",
       "4  0000013765  A2SUAM1J3GNN3B     5.0  1.252800e+09   \n",
       "\n",
       "                             category  \\\n",
       "3  [Books, Arts & Photography, Music]   \n",
       "4  [Books, Arts & Photography, Music]   \n",
       "\n",
       "                                        title          brand price  \n",
       "3  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        \n",
       "4  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rename by label instead of overwriting .columns positionally: the resulting\n",
    "# names are the same, but this cannot mislabel columns if their order changes\n",
    "# and it still runs after more columns (e.g. 'time') have been added to rating_.\n",
    "rating_ = rating_.rename(columns={'asin': 'iid', 'user': 'uid'})\n",
    "rating_.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rating_ = pd.merge(rating,item_info,on='iid',how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Timestamp('1996-05-20 00:00:00'), Timestamp('2018-10-02 00:00:00'))"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Take min/max on the raw epoch-second values and convert only those two\n",
    "# scalars, instead of converting the whole timestamp column to datetimes twice.\n",
    "date_min = pd.to_datetime(rating_.timestamp.min(), unit='s')\n",
    "date_max = pd.to_datetime(rating_.timestamp.max(), unit='s')\n",
    "date_min, date_max"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27163817, 8)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns) of the working rating frame.\n",
    "rating_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Timedelta('215 days 00:00:00')"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Length of one slice when the covered date range is floor-divided into\n",
    "# 19*2 = 38 equal parts.\n",
    "# NOTE(review): the reason for the 19*2 factor is not stated here — confirm intent.\n",
    "date_gap = (date_max-date_min)//(19*2)\n",
    "date_gap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calendar year of each review. The vectorised .dt accessor replaces a per-row\n",
    "# Python lambda over the full ratings table.\n",
    "rating_['time'] = pd.to_datetime(rating_['timestamp'], unit='s').dt.year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>iid</th>\n",
       "      <th>uid</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>category</th>\n",
       "      <th>title</th>\n",
       "      <th>brand</th>\n",
       "      <th>price</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>AYEDW3BFK53XK</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.325462e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "      <td>2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0000013765</td>\n",
       "      <td>A2SUAM1J3GNN3B</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.252800e+09</td>\n",
       "      <td>[Books, Arts &amp; Photography, Music]</td>\n",
       "      <td>Heavenly Highway Hymns: Shaped-Note Hymnal</td>\n",
       "      <td>Stamps/Baxter</td>\n",
       "      <td></td>\n",
       "      <td>2009</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          iid             uid  rating     timestamp  \\\n",
       "3  0000013765   AYEDW3BFK53XK     5.0  1.325462e+09   \n",
       "4  0000013765  A2SUAM1J3GNN3B     5.0  1.252800e+09   \n",
       "\n",
       "                             category  \\\n",
       "3  [Books, Arts & Photography, Music]   \n",
       "4  [Books, Arts & Photography, Music]   \n",
       "\n",
       "                                        title          brand price  time  \n",
       "3  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        2012  \n",
       "4  Heavenly Highway Hymns: Shaped-Note Hymnal  Stamps/Baxter        2009  "
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rating_['time'] = rating_['time'] - rating_['time'].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,\n",
       "       2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,\n",
       "       2018])"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distinct values of the `time` column in ascending order\n",
    "# (numpy is already imported as np in the notebook's first cell).\n",
    "np.sort(rating_['time'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1856344,), (704023,))"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_.uid.unique().shape, rating_.iid.unique().shape,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the rows whose `time` (year) is 2017, then overwrite `time` with the\n",
    "# calendar month of the review (vectorised .dt accessor instead of a row-wise lambda).\n",
    "s_rating = rating_[rating_['time'].isin([2017])].copy()\n",
    "s_rating['time'] = pd.to_datetime(s_rating['timestamp'], unit='s').dt.month\n",
    "# Month filter: range(1, 13) covers months 1 through 12.\n",
    "s_rating = s_rating[s_rating['time'].isin(range(1, 13))]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3933994, 9)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_rating.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>880147.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>4.469701</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>10.909923</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>4.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1070.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              rating\n",
       "count  880147.000000\n",
       "mean        4.469701\n",
       "std        10.909923\n",
       "min         1.000000\n",
       "25%         1.000000\n",
       "50%         2.000000\n",
       "75%         4.000000\n",
       "max      1070.000000"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_rating.groupby('uid').agg({\"rating\":'count'}).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of ratings per item (iid) and per user (uid) in s_rating.\n",
    "item_info = s_rating.groupby('iid')[['rating']].count()\n",
    "user_info = s_rating.groupby('uid')[['rating']].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((34769,), (23051,))"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Items / users with strictly more than `min_ratings` ratings are treated as active.\n",
    "min_ratings = 20\n",
    "active_item = item_info.loc[item_info['rating'] > min_ratings].index\n",
    "active_user = user_info.loc[user_info['rating'] > min_ratings].index\n",
    "active_item.shape, active_user.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(778957, 9)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Retain only the interactions whose user AND item are both in the active sets.\n",
    "keep_mask = s_rating['uid'].isin(active_user) & s_rating['iid'].isin(active_item)\n",
    "s_rating = s_rating[keep_mask]\n",
    "s_rating.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(rating    33.917835\n",
       " dtype: float64,\n",
       " rating    22.807865\n",
       " dtype: float64)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Recount ratings per item / per user on the current s_rating and show the mean counts.\n",
    "item_info = s_rating.groupby('iid')[['rating']].count()\n",
    "user_info = s_rating.groupby('uid')[['rating']].count()\n",
    "user_info.mean(), item_info.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((778957, 9), (22966,), (34153,))"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_rating.shape, s_rating.uid.unique().shape, s_rating.iid.unique().shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((22966,), (34153,))"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_rating.uid.unique().shape, s_rating.iid.unique().shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>iid</th>\n",
       "      <th>uid</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>category</th>\n",
       "      <th>title</th>\n",
       "      <th>brand</th>\n",
       "      <th>price</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>123</td>\n",
       "      <td>0001050230</td>\n",
       "      <td>A2SLME0E0ENFCP</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.514678e+09</td>\n",
       "      <td>[Books, Literature &amp; Fiction, Dramas &amp; Plays]</td>\n",
       "      <td>Love's Labour's Lost: Performed by Derek Jacob...</td>\n",
       "      <td>Visit Amazon's William Shakespeare Page</td>\n",
       "      <td>$20.93</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>125</td>\n",
       "      <td>0001050230</td>\n",
       "      <td>A2RTH3REZ7YEDB</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.513642e+09</td>\n",
       "      <td>[Books, Literature &amp; Fiction, Dramas &amp; Plays]</td>\n",
       "      <td>Love's Labour's Lost: Performed by Derek Jacob...</td>\n",
       "      <td>Visit Amazon's William Shakespeare Page</td>\n",
       "      <td>$20.93</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   index         iid             uid  rating     timestamp  \\\n",
       "0    123  0001050230  A2SLME0E0ENFCP     5.0  1.514678e+09   \n",
       "1    125  0001050230  A2RTH3REZ7YEDB     5.0  1.513642e+09   \n",
       "\n",
       "                                        category  \\\n",
       "0  [Books, Literature & Fiction, Dramas & Plays]   \n",
       "1  [Books, Literature & Fiction, Dramas & Plays]   \n",
       "\n",
       "                                               title  \\\n",
       "0  Love's Labour's Lost: Performed by Derek Jacob...   \n",
       "1  Love's Labour's Lost: Performed by Derek Jacob...   \n",
       "\n",
       "                                     brand   price  time  \n",
       "0  Visit Amazon's William Shakespeare Page  $20.93    12  \n",
       "1  Visit Amazon's William Shakespeare Page  $20.93    12  "
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Renumber rows 0..n-1; drop=False keeps the previous index as an `index` column.\n",
    "# NOTE(review): re-running this cell adds a further index column each time.\n",
    "s_rating = s_rating.reset_index(drop=False)\n",
    "s_rating.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Axes: xlabel='time'>"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjkAAAG0CAYAAADQLTb2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy88F64QAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA6Y0lEQVR4nO3de3wNd/7H8ffJ5eSCk9QlifyEpKXIulVoeqrtoqlU0y5tWLoWVdXll7SNLFpdv1CUrq7rCtleiG1Zl9+2FiFoLN1WXBq0SqluafyWk2g1iUslJPP7wyOzzrq0IRwZr+fjMY+HM9/P+c5nxu2dOTNzbIZhGAIAALAYL083AAAAcD0QcgAAgCURcgAAgCURcgAAgCURcgAAgCURcgAAgCURcgAAgCX5eLoBT6qoqNCRI0dUp04d2Ww2T7cDAAB+AsMwdOLECYWHh8vL6/Lna27pkHPkyBFFRER4ug0AAHAVDh8+rEaNGl12/JYOOXXq1JF0/iA5HA4PdwMAAH6KkpISRUREmP+PX84tHXIqP6JyOByEHAAAapgfu9SEC48BAIAlEXIAAIAlEXIAAIAl3dLX5AAAcDXKy8t19uxZT7dhWb6+vvL29r7meQg5AAD8RIZhyOVyqaioyNOtWF5wcLDCwsKu6Tl2hBwAAH6iyoATEhKiwMBAHiR7HRiGodOnT6uwsFCS1LBhw6uei5ADAMBPUF5ebgacevXqebodSwsICJAkFRYWKiQk5Ko/uuLCYwAAfoLKa3ACAwM93MmtofI4X8u1T4QcAACqgI+obozqOM6EHAAAYEmEHAAA8JNFRkZqxowZnm7jJ+HCYwAArlHkS1k3dHuHXku47tvIzMxUSkrKRbfLb9++XbVq1bru268OhBwAAG4xZWVlstvtV/XeBg0aVHM31w8fVwEAYHGdO3dWcnKyUlJSVL9+fcXHx2vatGlq3bq1atWqpYiICP33f/+3Tp48KUnauHGjBg0apOLiYtlsNtlsNo0bN07SxR9X2Ww2vfXWW3r88ccVGBioZs2aacWKFW7bX7FihZo1ayZ/f3916dJFCxYskM1mu+4PVSTkAABwC1iwYIHsdrs+/vhjZWRkyMvLS7NmzdKePXu0YMECbdiwQaNGjZIk3XvvvZoxY4YcDoeOHj2qo0ePasSIEZed+5VXXtEvf/lLffbZZ3rkkUfUr18/HT9+XJJ08OBB9erVSz179tSnn36q3/zmN/rd7353Q/aZj6sAoAa4Htd83IjrOnDzaNasmaZMmWK+bt68ufnryMhITZw4UUOHDtWcOXNkt9sVFBQkm82msLCwH537qaee0pNPPilJmjRpkmbNmqVt27bp4Ycf1p/+9Cc1b95cr7/+urndzz//XK+++mo17+HFCDkAANwCYmJi3F5/8MEHmjx5svbt26eSkhKdO3dOZ86c0enTp6v8wMM2bdqYv65Vq5YcDof5tQz79+9Xx44d3ervvvvuq9yLquHjKgAAbgEX3hF16NAhPfroo2rTpo3++te/Ki8vT+np6ZLOX5RcVb6+vm6vbTabKioqrq3hasCZHAAAbjF5eXmqqKjQ1KlT5eV1/nzH0qVL3WrsdrvKy8uveVvNmzfX6tWr3dZt3779muf9KTiTAwDALaZp06Y6e/as/vjHP+rrr7/WO++8o4yMDLeayMhInTx5Ujk5Ofr22291+vTpq9rWb37zG+3bt08vvviivvzySy1dulSZmZmSrv9XZBByAAC4xbRt21bTpk3T73//e7Vq1UoLFy7U5MmT3WruvfdeDR06VH369FGDBg3cLlquiqioKP3v//6v3nvvPbVp00Zz5841767y8/O75n25EpthGMZ13cJNrKSkREFBQSouLpbD4fB0OwBwWdxd5XlnzpzRwYMHFRUVJX9/f0+3U6O9+uqrysjI0OHDhy9bc6Xj/VP//+aaHAC3PAIEcH3NmTNHHTt2VL169fTxxx/r9ddfV3Jy
8nXfLiEHAABcVwcOHNDEiRN1/PhxNW7cWL/97W81evTo675dQg4AALiupk+frunTp9/w7XLhMQAAsCRCDgAAsKQqhZzIyEjz20gvXJKSkiSdvxI6KSlJ9erVU+3atZWYmKiCggK3OfLz85WQkKDAwECFhIRo5MiROnfunFvNxo0b1b59e/n5+alp06bm/fQXSk9PV2RkpPz9/RUbG6tt27ZVcdcBAKi6m+FJvreC6jjOVbomZ/v27W5PP/z888/10EMPqXfv3pKk4cOHKysrS8uWLVNQUJCSk5P1xBNP6OOPP5YklZeXKyEhQWFhYdq8ebOOHj2qAQMGyNfXV5MmTZJ0/ttKExISNHToUC1cuFA5OTl65pln1LBhQ8XHx0uSlixZotTUVGVkZCg2NlYzZsxQfHy89u/fr5CQkGs+KAAA/Ce73S4vLy8dOXJEDRo0kN1uv+4Ps7sVGYahsrIyHTt2TF5eXrLb7Vc91zU9JyclJUWrVq3SgQMHVFJSogYNGmjRokXq1auXJGnfvn1q2bKlcnNzdc8992jNmjV69NFHdeTIEYWGhkqSMjIy9OKLL+rYsWOy2+168cUXlZWVpc8//9zcTt++fVVUVKTs7GxJUmxsrDp27KjZs2dLOp/2IiIi9Nxzz+mll176yf3znBwAUs24hbwm9HgrKCsr09GjR6/66b/46QIDA9WwYcNLhpzr/pycsrIyvfvuu0pNTZXNZlNeXp7Onj2ruLg4s6ZFixZq3LixGXJyc3PVunVrM+BIUnx8vIYNG6Y9e/borrvuUm5urtsclTUpKSnmdvPy8txuPfPy8lJcXJxyc3Ov2HNpaalKS0vN1yUlJVe7+wCAS6juMHazBTG73a7GjRvr3Llz1fK9Trg0b29v+fj4XPOZsqsOOcuXL1dRUZGeeuopSZLL5ZLdbldwcLBbXWhoqFwul1lzYcCpHK8cu1JNSUmJfvjhB33//fcqLy+/ZM2+ffuu2PPkyZP1yiuvVGk/AQC4kM1mk6+v70XfvI2bz1XfXfX222+re/fuCg8Pr85+rqvRo0eruLjYXK70OGkAAFCzXdWZnG+++UYffPCB3nvvPXNdWFiYysrKVFRU5HY2p6CgQGFhYWbNf94FVXn31YU1/3lHVkFBgRwOhwICAuTt7S1vb+9L1lTOcTl+fn7X/cvAAADAzeGqzuTMnz9fISEhSkj492elMTEx8vX1VU5Ojrlu//79ys/Pl9PplCQ5nU7t3r1bhYWFZs369evlcDgUHR1t1lw4R2VN5Rx2u10xMTFuNRUVFcrJyTFrAAAAqnwmp6KiQvPnz9fAgQPl4/PvtwcFBWnw4MFKTU1V3bp15XA49Nxzz8npdOqee+6RJHXr1k3R0dHq37+/pkyZIpfLpTFjxigpKck8wzJ06FDNnj1bo0aN0tNPP60NGzZo6dKlysr698VsqampGjhwoDp06KC7775bM2bM0KlTpzRo0KBrPR4AAMAiqhxyPvjgA+Xn5+vpp5++aGz69Ony8vJSYmKiSktLFR8frzlz5pjj3t7eWrVqlYYNGyan06latWpp4MCBGj9+vFkTFRWlrKwsDR8+XDNnzlSjRo301ltvmc/IkaQ+ffro2LFjSktLk8vlUrt27ZSdnX3RxcgAAODWdU3PyanpeE4OAKlmPIOmJvQoWf8Wctwcfur/33x3FQAAsCRCDgAAsKSrfhggAAA1UU356A/XjjM5AADAkgg5AADAkvi4CqihOOUOAFfGmRwAAGBJhBwAAGBJhBwAAGBJhBwAAGBJXHgM4LriMf8APIUzOQAAwJIIOQAAwJIIOQAAwJIIOQAAwJIIOQAAwJIIOQAAwJK4hRw3HLcUAwBuBM7kAAAASyLkAAAASyLkAAAASyLkAAAASyLkAAAAS+LuKuASqvsOMIm7wADgRuNMDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCQeBvgT8XA4AABq
Fs7kAAAASyLkAAAAS+LjKgAAbjJcIlE9OJMDAAAsqcoh51//+pd+/etfq169egoICFDr1q31ySefmOOGYSgtLU0NGzZUQECA4uLidODAAbc5jh8/rn79+snhcCg4OFiDBw/WyZMn3Wo+++wz3X///fL391dERISmTJlyUS/Lli1TixYt5O/vr9atW2v16tVV3R0AAGBRVQo533//vTp16iRfX1+tWbNGe/fu1dSpU3XbbbeZNVOmTNGsWbOUkZGhrVu3qlatWoqPj9eZM2fMmn79+mnPnj1av369Vq1apQ8//FDPPvusOV5SUqJu3bqpSZMmysvL0+uvv65x48bpjTfeMGs2b96sJ598UoMHD9bOnTvVs2dP9ezZU59//vm1HA8AAGARVbom5/e//70iIiI0f/58c11UVJT5a8MwNGPGDI0ZM0Y9evSQJP35z39WaGioli9frr59++qLL75Qdna2tm/frg4dOkiS/vjHP+qRRx7RH/7wB4WHh2vhwoUqKyvTvHnzZLfb9bOf/Uy7du3StGnTzDA0c+ZMPfzwwxo5cqQkacKECVq/fr1mz56tjIyMazsqAACgxqvSmZwVK1aoQ4cO6t27t0JCQnTXXXfpzTffNMcPHjwol8uluLg4c11QUJBiY2OVm5srScrNzVVwcLAZcCQpLi5OXl5e2rp1q1nzwAMPyG63mzXx8fHav3+/vv/+e7Pmwu1U1lRu51JKS0tVUlLitgAAAGuqUsj5+uuvNXfuXDVr1kxr167VsGHD9Pzzz2vBggWSJJfLJUkKDQ11e19oaKg55nK5FBIS4jbu4+OjunXrutVcao4Lt3G5msrxS5k8ebKCgoLMJSIioiq7DwAAapAqhZyKigq1b99ekyZN0l133aVnn31WQ4YMqTEfD40ePVrFxcXmcvjwYU+3BAAArpMqhZyGDRsqOjrabV3Lli2Vn58vSQoLC5MkFRQUuNUUFBSYY2FhYSosLHQbP3funI4fP+5Wc6k5LtzG5Woqxy/Fz89PDofDbQEAANZUpZDTqVMn7d+/323dl19+qSZNmkg6fxFyWFiYcnJyzPGSkhJt3bpVTqdTkuR0OlVUVKS8vDyzZsOGDaqoqFBsbKxZ8+GHH+rs2bNmzfr169W8eXPzTi6n0+m2ncqayu0AAIBbW5VCzvDhw7VlyxZNmjRJX331lRYtWqQ33nhDSUlJkiSbzaaUlBRNnDhRK1as0O7duzVgwACFh4erZ8+eks6f+Xn44Yc1ZMgQbdu2TR9//LGSk5PVt29fhYeHS5J+9atfyW63a/DgwdqzZ4+WLFmimTNnKjU11ezlhRdeUHZ2tqZOnap9+/Zp3Lhx+uSTT5ScnFxNhwYAANRkVbqFvGPHjnr//fc1evRojR8/XlFRUZoxY4b69etn1owaNUqnTp3Ss88+q6KiIt13333Kzs6Wv7+/WbNw4UIlJyfrwQcflJeXlxITEzVr1ixzPCgoSOvWrVNSUpJiYmJUv359paWluT1L595779WiRYs0ZswYvfzyy2rWrJmWL1+uVq1aXcvxAAAAFlHl76569NFH9eijj1523Gazafz48Ro/fvxla+rWratFixZdcTtt2rTRP/7xjyvW9O7dW717975ywwAA4JbEd1cBAABLIuQAAABLIuQAAABLIuQAAABLqvKFx7h5Rb6UVe1zHnotodrnBADgRuBMDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCRCDgAAsCQfTzcAAABqpsiXsqp1vkOvJVTrfJzJAQAAlkTIAQAAllSlkDNu3DjZbDa3pUWLFub4mTNn
lJSUpHr16ql27dpKTExUQUGB2xz5+flKSEhQYGCgQkJCNHLkSJ07d86tZuPGjWrfvr38/PzUtGlTZWZmXtRLenq6IiMj5e/vr9jYWG3btq0quwIAACyuymdyfvazn+no0aPm8tFHH5ljw4cP18qVK7Vs2TJt2rRJR44c0RNPPGGOl5eXKyEhQWVlZdq8ebMWLFigzMxMpaWlmTUHDx5UQkKCunTpol27diklJUXPPPOM1q5da9YsWbJEqampGjt2rHbs2KG2bdsqPj5ehYWFV3scAACAxVQ55Pj4+CgsLMxc6tevL0kqLi7W22+/rWnTpqlr166KiYnR/PnztXnzZm3ZskWStG7dOu3du1fvvvuu2rVrp+7du2vChAlKT09XWVmZJCkjI0NRUVGaOnWqWrZsqeTkZPXq1UvTp083e5g2bZqGDBmiQYMGKTo6WhkZGQoMDNS8efOq45gAAAALqHLIOXDggMLDw3X77berX79+ys/PlyTl5eXp7NmziouLM2tbtGihxo0bKzc3V5KUm5ur1q1bKzQ01KyJj49XSUmJ9uzZY9ZcOEdlTeUcZWVlysvLc6vx8vJSXFycWXM5paWlKikpcVsAAIA1VSnkxMbGKjMzU9nZ2Zo7d64OHjyo+++/XydOnJDL5ZLdbldwcLDbe0JDQ+VyuSRJLpfLLeBUjleOXammpKREP/zwg7799luVl5dfsqZyjsuZPHmygoKCzCUiIqIquw8AAGqQKj0np3v37uav27Rpo9jYWDVp0kRLly5VQEBAtTdX3UaPHq3U1FTzdUlJCUEHAACLuqZbyIODg3XnnXfqq6++UlhYmMrKylRUVORWU1BQoLCwMElSWFjYRXdbVb7+sRqHw6GAgADVr19f3t7el6ypnONy/Pz85HA43BYAAGBN1xRyTp48qX/+859q2LChYmJi5Ovrq5ycHHN8//79ys/Pl9PplCQ5nU7t3r3b7S6o9evXy+FwKDo62qy5cI7Kmso57Ha7YmJi3GoqKiqUk5Nj1gAAAFQp5IwYMUKbNm3SoUOHtHnzZj3++OPy9vbWk08+qaCgIA0ePFipqan6+9//rry8PA0aNEhOp1P33HOPJKlbt26Kjo5W//799emnn2rt2rUaM2aMkpKS5OfnJ0kaOnSovv76a40aNUr79u3TnDlztHTpUg0fPtzsIzU1VW+++aYWLFigL774QsOGDdOpU6c0aNCgajw0AACgJqvSNTn/93//pyeffFLfffedGjRooPvuu09btmxRgwYNJEnTp0+Xl5eXEhMTVVpaqvj4eM2ZM8d8v7e3t1atWqVhw4bJ6XSqVq1aGjhwoMaPH2/WREVFKSsrS8OHD9fMmTPVqFEjvfXWW4qPjzdr+vTpo2PHjiktLU0ul0vt2rVTdnb2RRcjAwCAW1eVQs7ixYuvOO7v76/09HSlp6dftqZJkyZavXr1Fefp3Lmzdu7cecWa5ORkJScnX7EGAADcuvjuKgAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAAAYEnXFHJee+012Ww2paSkmOvOnDmjpKQk1atXT7Vr11ZiYqIKCgrc3pefn6+EhAQFBgYqJCREI0eO1Llz59xqNm7cqPbt28vPz09NmzZVZmbmRdtPT09XZGSk/P39FRsbq23btl3L7gAAAAu56pCzfft2/elPf1KbNm3c1g8fPlwrV67UsmXLtGnTJh05ckRPPPGEOV5eXq6EhASVlZVp8+bNWrBg
gTIzM5WWlmbWHDx4UAkJCerSpYt27dqllJQUPfPMM1q7dq1Zs2TJEqWmpmrs2LHasWOH2rZtq/j4eBUWFl7tLgEAAAu5qpBz8uRJ9evXT2+++aZuu+02c31xcbHefvttTZs2TV27dlVMTIzmz5+vzZs3a8uWLZKkdevWae/evXr33XfVrl07de/eXRMmTFB6errKysokSRkZGYqKitLUqVPVsmVLJScnq1evXpo+fbq5rWnTpmnIkCEaNGiQoqOjlZGRocDAQM2bN+9ajgcAALCIqwo5SUlJSkhIUFxcnNv6vLw8nT171m19ixYt1LhxY+Xm5kqScnNz1bp1a4WGhpo18fHxKikp0Z49e8ya/5w7Pj7enKOsrEx5eXluNV5eXoqLizNrLqW0tFQlJSVuCwAAsCafqr5h8eLF2rFjh7Zv337RmMvlkt1uV3BwsNv60NBQuVwus+bCgFM5Xjl2pZqSkhL98MMP+v7771VeXn7Jmn379l2298mTJ+uVV175aTsKAABqtCqdyTl8+LBeeOEFLVy4UP7+/terp+tm9OjRKi4uNpfDhw97uiUAAHCdVCnk5OXlqbCwUO3bt5ePj498fHy0adMmzZo1Sz4+PgoNDVVZWZmKiorc3ldQUKCwsDBJUlhY2EV3W1W+/rEah8OhgIAA1a9fX97e3pesqZzjUvz8/ORwONwWAABgTVUKOQ8++KB2796tXbt2mUuHDh3Ur18/89e+vr7Kyckx37N//37l5+fL6XRKkpxOp3bv3u12F9T69evlcDgUHR1t1lw4R2VN5Rx2u10xMTFuNRUVFcrJyTFrAADAra1K1+TUqVNHrVq1cltXq1Yt1atXz1w/ePBgpaamqm7dunI4HHruuefkdDp1zz33SJK6deum6Oho9e/fX1OmTJHL5dKYMWOUlJQkPz8/SdLQoUM1e/ZsjRo1Sk8//bQ2bNigpUuXKisry9xuamqqBg4cqA4dOujuu+/WjBkzdOrUKQ0aNOiaDggAALCGKl94/GOmT58uLy8vJSYmqrS0VPHx8ZozZ4457u3trVWrVmnYsGFyOp2qVauWBg4cqPHjx5s1UVFRysrK0vDhwzVz5kw1atRIb731luLj482aPn366NixY0pLS5PL5VK7du2UnZ190cXIAADg1nTNIWfjxo1ur/39/ZWenq709PTLvqdJkyZavXr1Feft3Lmzdu7cecWa5ORkJScn/+ReAQDArYPvrgIAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZEyAEAAJZUpZAzd+5ctWnTRg6HQw6HQ06nU2vWrDHHz5w5o6SkJNWrV0+1a9dWYmKiCgoK3ObIz89XQkKCAgMDFRISopEjR+rcuXNuNRs3blT79u3l5+enpk2bKjMz86Je0tPTFRkZKX9/f8XGxmrbtm1V2RUAAGBxVQo5jRo10muvvaa8vDx98skn6tq1q3r06KE9e/ZIkoYPH66VK1dq2bJl2rRpk44cOaInnnjCfH95ebkSEhJUVlamzZs3a8GCBcrMzFRaWppZc/DgQSUkJKhLly7atWuXUlJS9Mwzz2jt2rVmzZIlS5SamqqxY8dqx44datu2reLj41VYWHitxwMAAFhElULOY489pkceeUTNmjXTnXfeqVdffVW1a9fWli1bVFxcrLffflvTpk1T165dFRMTo/nz52vz5s3asmWLJGndunXau3ev3n33XbVr107du3fXhAkTlJ6errKyMklSRkaGoqKi
NHXqVLVs2VLJycnq1auXpk+fbvYxbdo0DRkyRIMGDVJ0dLQyMjIUGBioefPmVeOhAQAANdlVX5NTXl6uxYsX69SpU3I6ncrLy9PZs2cVFxdn1rRo0UKNGzdWbm6uJCk3N1etW7dWaGioWRMfH6+SkhLzbFBubq7bHJU1lXOUlZUpLy/PrcbLy0txcXFmzeWUlpaqpKTEbQEAANZU5ZCze/du1a5dW35+fho6dKjef/99RUdHy+VyyW63Kzg42K0+NDRULpdLkuRyudwCTuV45diVakpKSvTDDz/o22+/VXl5+SVrKue4nMmTJysoKMhcIiIiqrr7AACghqhyyGnevLl27dqlrVu3atiwYRo4cKD27t17PXqrdqNHj1ZxcbG5HD582NMtAQCA68Snqm+w2+1q2rSpJCkmJkbbt2/XzJkz1adPH5WVlamoqMjtbE5BQYHCwsIkSWFhYRfdBVV599WFNf95R1ZBQYEcDocCAgLk7e0tb2/vS9ZUznE5fn5+8vPzq+ouAwCAGuian5NTUVGh0tJSxcTEyNfXVzk5OebY/v37lZ+fL6fTKUlyOp3avXu3211Q69evl8PhUHR0tFlz4RyVNZVz2O12xcTEuNVUVFQoJyfHrAEAAKjSmZzRo0ere/fuaty4sU6cOKFFixZp48aNWrt2rYKCgjR48GClpqaqbt26cjgceu655+R0OnXPPfdIkrp166bo6Gj1799fU6ZMkcvl0pgxY5SUlGSeYRk6dKhmz56tUaNG6emnn9aGDRu0dOlSZWVlmX2kpqZq4MCB6tChg+6++27NmDFDp06d0qBBg6rx0AAAgJqsSiGnsLBQAwYM0NGjRxUUFKQ2bdpo7dq1euihhyRJ06dPl5eXlxITE1VaWqr4+HjNmTPHfL+3t7dWrVqlYcOGyel0qlatWho4cKDGjx9v1kRFRSkrK0vDhw/XzJkz1ahRI7311luKj483a/r06aNjx44pLS1NLpdL7dq1U3Z29kUXIwMAgFtXlULO22+/fcVxf39/paenKz09/bI1TZo00erVq684T+fOnbVz584r1iQnJys5OfmKNQAA4NbFd1cBAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLIuQAAABLqlLImTx5sjp27Kg6deooJCREPXv21P79+91qzpw5o6SkJNWrV0+1a9dWYmKiCgoK3Gry8/OVkJCgwMBAhYSEaOTIkTp37pxbzcaNG9W+fXv5+fmpadOmyszMvKif9PR0RUZGyt/fX7Gxsdq2bVtVdgcAAFhYlULOpk2blJSUpC1btmj9+vU6e/asunXrplOnTpk1w4cP18qVK7Vs2TJt2rRJR44c0RNPPGGOl5eXKyEhQWVlZdq8ebMWLFigzMxMpaWlmTUHDx5UQkKCunTpol27diklJUXPPPOM1q5da9YsWbJEqampGjt2rHbs2KG2bdsqPj5ehYWF13I8AACARfhUpTg7O9vtdWZmpkJCQpSXl6cHHnhAxcXFevvtt7Vo0SJ17dpVkjR//ny1bNlSW7Zs0T333KN169Zp7969+uCDDxQaGqp27dppwoQJevHFFzVu3DjZ7XZlZGQoKipKU6dOlSS1bNlSH330kaZPn674+HhJ0rRp0zRkyBANGjRIkpSRkaGsrCzNmzdPL7300jUfGAAAULNd0zU5xcXFkqS6detKkvLy8nT27FnFxcWZNS1atFDjxo2Vm5srScrNzVXr1q0VGhpq1sTHx6ukpER7
9uwxay6co7Kmco6ysjLl5eW51Xh5eSkuLs6suZTS0lKVlJS4LQAAwJquOuRUVFQoJSVFnTp1UqtWrSRJLpdLdrtdwcHBbrWhoaFyuVxmzYUBp3K8cuxKNSUlJfrhhx/07bffqry8/JI1lXNcyuTJkxUUFGQuERERVd9xAABQI1x1yElKStLnn3+uxYsXV2c/19Xo0aNVXFxsLocPH/Z0SwAA4Dqp0jU5lZKTk7Vq1Sp9+OGHatSokbk+LCxMZWVlKioqcjubU1BQoLCwMLPmP++Cqrz76sKa/7wjq6CgQA6HQwEBAfL29pa3t/clayrnuBQ/Pz/5+flVfYcBAECNU6UzOYZhKDk5We+//742bNigqKgot/GYmBj5+voqJyfHXLd//37l5+fL6XRKkpxOp3bv3u12F9T69evlcDgUHR1t1lw4R2VN5Rx2u10xMTFuNRUVFcrJyTFrAADAra1KZ3KSkpK0aNEi/e1vf1OdOnXM61+CgoIUEBCgoKAgDR48WKmpqapbt64cDoeee+45OZ1O3XPPPZKkbt26KTo6Wv3799eUKVPkcrk0ZswYJSUlmWdZhg4dqtmzZ2vUqFF6+umntWHDBi1dulRZWVlmL6mpqRo4cKA6dOigu+++WzNmzNCpU6fMu60AAMCtrUohZ+7cuZKkzp07u62fP3++nnrqKUnS9OnT5eXlpcTERJWWlio+Pl5z5swxa729vbVq1SoNGzZMTqdTtWrV0sCBAzV+/HizJioqSllZWRo+fLhmzpypRo0a6a233jJvH5ekPn366NixY0pLS5PL5VK7du2UnZ190cXIAADg1lSlkGMYxo/W+Pv7Kz09Xenp6ZetadKkiVavXn3FeTp37qydO3desSY5OVnJyck/2hMAALj18N1VAADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkgg5AADAkqoccj788EM99thjCg8Pl81m0/Lly93GDcNQWlqaGjZsqICAAMXFxenAgQNuNcePH1e/fv3kcDgUHByswYMH6+TJk241n332me6//375+/srIiJCU6ZMuaiXZcuWqUWLFvL391fr1q21evXqqu4OAACwqCqHnFOnTqlt27ZKT0+/5PiUKVM0a9YsZWRkaOvWrapVq5bi4+N15swZs6Zfv37as2eP1q9fr1WrVunDDz/Us88+a46XlJSoW7duatKkifLy8vT6669r3LhxeuONN8yazZs368knn9TgwYO1c+dO9ezZUz179tTnn39e1V0CAAAW5FPVN3Tv3l3du3e/5JhhGJoxY4bGjBmjHj16SJL+/Oc/KzQ0VMuXL1ffvn31xRdfKDs7W9u3b1eHDh0kSX/84x/1yCOP6A9/+IPCw8O1cOFClZWVad68ebLb7frZz36mXbt2adq0aWYYmjlzph5++GGNHDlSkjRhwgStX79es2fPVkZGxlUdDAAAYB3Vek3OwYMH5XK5FBcXZ64LCgpSbGyscnNzJUm5ubkKDg42A44kxcXFycvLS1u3bjVrHnjgAdntdrMmPj5e+/fv1/fff2/WXLidyprK7VxKaWmpSkpK3BYAAGBN1RpyXC6XJCk0NNRtfWhoqDnmcrkUEhLiNu7j46O6deu61Vxqjgu3cbmayvFLmTx5soKCgswlIiKiqrsIAABqiFvq7qrRo0eruLjYXA4fPuzplgAAwHVSrSEnLCxMklRQUOC2vqCgwBwLCwtTYWGh2/i5c+d0/Phxt5pLzXHhNi5XUzl+
KX5+fnI4HG4LAACwpmoNOVFRUQoLC1NOTo65rqSkRFu3bpXT6ZQkOZ1OFRUVKS8vz6zZsGGDKioqFBsba9Z8+OGHOnv2rFmzfv16NW/eXLfddptZc+F2KmsqtwMAAG5tVQ45J0+e1K5du7Rr1y5J5y823rVrl/Lz82Wz2ZSSkqKJEydqxYoV2r17twYMGKDw8HD17NlTktSyZUs9/PDDGjJkiLZt26aPP/5YycnJ6tu3r8LDwyVJv/rVr2S32zV48GDt2bNHS5Ys0cyZM5Wammr28cILLyg7O1tTp07Vvn37NG7cOH3yySdKTk6+9qMCAABqvCrfQv7JJ5+oS5cu5uvK4DFw4EBlZmZq1KhROnXqlJ599lkVFRXpvvvuU3Z2tvz9/c33LFy4UMnJyXrwwQfl5eWlxMREzZo1yxwPCgrSunXrlJSUpJiYGNWvX19paWluz9K59957tWjRIo0ZM0Yvv/yymjVrpuXLl6tVq1ZXdSAAAIC1VDnkdO7cWYZhXHbcZrNp/PjxGj9+/GVr6tatq0WLFl1xO23atNE//vGPK9b07t1bvXv3vnLDAADglnRL3V0FAABuHYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSYQcAABgSTU+5KSnpysyMlL+/v6KjY3Vtm3bPN0SAAC4CdTokLNkyRKlpqZq7Nix2rFjh9q2bav4+HgVFhZ6ujUAAOBhNTrkTJs2TUOGDNGgQYMUHR2tjIwMBQYGat68eZ5uDQAAeJiPpxu4WmVlZcrLy9Po0aPNdV5eXoqLi1Nubu4l31NaWqrS0lLzdXFxsSSppKTkR7dXUXr6Gju+2E/ZblXUhB6l6u+zJvQo8ftdXWpCjxK/39WlJvQo8ftdXX5qj5V1hmFcudCoof71r38ZkozNmze7rR85cqRx9913X/I9Y8eONSSxsLCwsLCwWGA5fPjwFbNCjT2TczVGjx6t1NRU83VFRYWOHz+uevXqyWazXfP8JSUlioiI0OHDh+VwOK55vuulJvRJj9WnJvRJj9WnJvRJj9WnJvR5PXo0DEMnTpxQeHj4FetqbMipX7++vL29VVBQ4La+oKBAYWFhl3yPn5+f/Pz83NYFBwdXe28Oh+Om/cN2oZrQJz1Wn5rQJz1Wn5rQJz1Wn5rQZ3X3GBQU9KM1NfbCY7vdrpiYGOXk5JjrKioqlJOTI6fT6cHOAADAzaDGnsmRpNTUVA0cOFAdOnTQ3XffrRkzZujUqVMaNGiQp1sDAAAeVqNDTp8+fXTs2DGlpaXJ5XKpXbt2ys7OVmhoqEf68fPz09ixYy/6SOxmUxP6pMfqUxP6pMfqUxP6pMfqUxP69GSPNsP4sfuvAAAAap4ae00OAADAlRByAACAJRFyAACAJRFyAACAJRFygGvAdfsAcPOq0beQA57m5+enTz/9VC1btvR0K8BN6+jRo5o7d64++ugjHT16VF5eXrr99tvVs2dPPfXUU/L29vZ0i7AozuRcR4cPH9bTTz/t6Tb0ww8/6KOPPtLevXsvGjtz5oz+/Oc/e6Ard1988YXmz5+vffv2SZL27dunYcOG6emnn9aGDRs83N35B09eaikvL9drr71mvr7ZnDp1SvPnz9fvfvc7zZ49W999952nW9KOHTt08OBB8/U777yjTp06KSIiQvfdd58WL17swe7Oe+655/SPf/zD0238JLNnz9aAAQPM4/bOO+8oOjpaLVq00Msvv6xz5855tL9PPvlELVu21OrVq3X2
7FkdOHBAMTExqlWrlkaMGKEHHnhAJ06c8GiPsLBq+UpwXNKuXbsMLy8vj/awf/9+o0mTJobNZjO8vLyMBx54wDhy5Ig57nK5PN7jmjVrDLvdbtStW9fw9/c31qxZYzRo0MCIi4szunbtanh7exs5OTke7dFmsxnt2rUzOnfu7LbYbDajY8eORufOnY0uXbp4tEfDMIyWLVsa3333nWEYhpGfn29ERkYaQUFBRseOHY26desaISEhxtdff+3RHtu0aWOsX7/eMAzDePPNN42AgADj+eefN+bOnWukpKQYtWvXNt5++22P9lj596VZs2bGa6+9Zhw9etSj/VzOhAkTjDp16hiJiYlGWFiY8dprrxn16tUzJk6caEyaNMlo0KCBkZaW5tEeO3XqZIwbN858/c477xixsbGGYRjG8ePHjXbt2hnPP/+8p9pzU1paaixZssRISUkx+vbta/Tt29dISUkxli5dapSWlnq6vR/lcrmMV155xdNtGIZhGIcPHzZOnDhx0fqysjJj06ZNN6wPQs41+Nvf/nbFZfr06R4PED179jQSEhKMY8eOGQcOHDASEhKMqKgo45tvvjEM4+YIOU6n0/jd735nGIZh/OUvfzFuu+024+WXXzbHX3rpJeOhhx7yVHuGYRjG5MmTjaioqIvClo+Pj7Fnzx4PdXUxm81mFBQUGIZhGP369TPuvfdeo6ioyDAMwzhx4oQRFxdnPPnkk55s0QgICDAOHTpkGIZh3HXXXcYbb7zhNr5w4UIjOjraE62ZbDab8cEHHxgvvPCCUb9+fcPX19f4xS9+YaxcudIoLy/3aG8XuuOOO4y//vWvhmGc/6HK29vbePfdd83x9957z2jatKmn2jMM4/zv9z//+U/zdXl5ueHr62u4XC7DMAxj3bp1Rnh4uKfaMx04cMC4/fbbDX9/f+PnP/+58ctf/tL45S9/afz85z83/P39jaZNmxoHDhzwdJtXdDP8YH3kyBGjY8eOhpeXl+Ht7W3079/fLezc6P9zCDnXoPKnPZvNdtnF03/gQkJCjM8++8x8XVFRYQwdOtRo3Lix8c9//vOmCDkOh8P8x6O8vNzw8fExduzYYY7v3r3bCA0N9VR7pm3bthl33nmn8dvf/tYoKyszDOPmDjm33367sW7dOrfxjz/+2IiIiPBEa6Z69eoZn3zyiWEY5/987tq1y238q6++MgICAjzRmunC41hWVmYsWbLEiI+PN7y9vY3w8HDj5Zdfvin+wwsICDB/YDEMw/D19TU+//xz8/WhQ4eMwMBAT7RmatKkifHRRx+Zr48cOWLYbDbj9OnThmEYxsGDBw1/f39PtWeKi4szevToYRQXF180VlxcbPTo0cPo1q2bBzr7t08//fSKy5IlSzz+7/mAAQOM2NhYY/v27cb69euNmJgYo0OHDsbx48cNwzgfcmw22w3rh5BzDcLDw43ly5dfdnznzp0e/wNXp04dY+/evRetT0pKMho1amR8+OGHHu/R4XAYX331lfm6du3abj/5HTp06Kb4R9Awzp8NGTBggNGmTRtj9+7dhq+v700XcgoLCw3DOP/nc/fu3W7jN8Ox/PWvf20MHjzYMAzD6N27tzFmzBi38UmTJhmtW7f2RGumC0POhb755htj7NixRpMmTTz+98YwDCMqKspYs2aNYRiG8eWXXxpeXl7G0qVLzfGsrCwjMjLSU+0ZhmEYL7zwgtGqVStjzZo1xoYNG4wuXboYnTt3Nsezs7ONO+64w4MdnhcQEHDR35cLffbZZzdF+L7cD9aV6z395zI8PNzYunWr+frMmTPGY489ZrRr18747rvvbvgP1txddQ1iYmKUl5enHj16XHLcZrN5/BbjFi1amBf+XWj27NmSpF/84heeaMtNZGSkDhw4oDvuuEOSlJubq8aNG5vj+fn5atiwoafac1O7dm0tWLBAixcvVlxcnMrLyz3d0kUefPBB+fj4qKSkRPv371erVq3MsW+++Ub16tXz
YHfS73//e3Xq1Ek///nP1aFDB02dOlUbN25Uy5YttX//fm3ZskXvv/++R3u8nMaNG2vcuHEaO3asPvjgA0+3o379+mnAgAHq0aOHcnJyNGrUKI0YMULfffedbDabXn31VfXq1cujPU6cOFFHjx7VY489pvLycjmdTr377rvmuM1m0+TJkz3Y4XnBwcE6dOiQ29+XCx06dEjBwcE3tqn/ULduXU2ZMkUPPvjgJcf37Nmjxx577AZ35a64uFi33Xab+drPz0/vvfeeevfurS5durj93t8IhJxrMHLkSJ06deqy402bNtXf//73G9jRxR5//HH95S9/Uf/+/S8amz17tioqKpSRkeGBzv5t2LBhbmHhP/+RWbNmjbp27Xqj27qivn376r777lNeXp6aNGni6XZMY8eOdXtdu3Ztt9crV67U/ffffyNbukh4eLh27typ1157TStXrpRhGNq2bZsOHz6sTp066eOPP1aHDh082mOTJk2ueFuzzWbTQw89dAM7urRXXnlFAQEBys3N1ZAhQ/TSSy+pbdu2GjVqlE6fPq3HHntMEyZM8GiPtWvX1pIlS3TmzBmdO3fuoj+T3bp181Bn7p555hkNGDBA//M//6MHH3xQoaGhkqSCggLl5ORo4sSJeu655zzaY0xMjI4cOXLZf3OKioo8/oP17bffrs8++0zNmjUz1/n4+GjZsmXq3bu3Hn300RvaD99CDgCAzp9lnDlzplwul2w2m6TzD/wMCwtTSkqKRo0a5dH+3n//fZ06dUq//vWvLzn+/fffa8WKFRo4cOAN7uzfXnzxRe3atUtr1669aOzcuXNKTEzUypUrVVFRcUP6IeQAAHCBgwcPyuVySZLCwsIUFRXl4Y5qjnPnzun06dNyOByXHf/Xv/51w86A8zBAAAAuEBUVJafTKafTaQacm+XhrldyM/To4+Nz2YAjnX/69SuvvHLD+uFMDgAAP+LTTz9V+/btb8qbDSrR48W48BgAcMtbsWLFFce//vrrG9TJ5dFj1XEmBwBwy/Py8vrRx37YbDaPniWhx6vo54ZsBQCAm1jDhg313nvvqaKi4pLLjh07PN0iPV4FQg4A4JZX+XDXy7kZHu5Kj1XHNTkAgFteTXi4Kz1WHdfkAAAAS+LjKgAAYEmEHAAAYEmEHAAAYEmEHAAAYEmEHAA1ysaNG2Wz2VRUVOTpVgDc5Li7CsBNrXPnzmrXrp1mzJghSSorK9Px48cVGhoqm83m2eYA3NR4Tg6AGsVutyssLMzTbQCoAfi4CsBN66mnntKmTZs0c+ZM2Ww22Ww2ZWZmun1clZmZqeDgYK1atUrNmzdXYGCgevXqpdOnT2vBggWKjIzUbbfdpueff97t+3JKS0s1YsQI/dd//Zdq1aql2NhYbdy40TM7CuC64EwOgJvWzJkz9eWXX6pVq1YaP368JGnPnj0X1Z0+fVqzZs3S4sWLdeLECT3xxBN6/PHHFRwcrNWrV+vrr79WYmKiOnXqpD59+kiSkpOTtXfvXi1evFjh4eF6//339fDDD2v37t1q1qzZDd1PANcHIQfATSsoKEh2u12BgYHmR1T79u27qO7s2bOaO3eu7rjjDklSr1699M4776igoEC1a9dWdHS0unTpor///e/q06eP8vPzNX/+fOXn5ys8PFySNGLECGVnZ2v+/PmaNGnSjdtJANcNIQdAjRcYGGgGHEkKDQ1VZGSkateu7bausLBQkrR7926Vl5frzjvvdJuntLRU9erVuzFNA7juCDkAajxfX1+31zab7ZLrKioqJEknT56Ut7e38vLy5O3t7VZ3YTACULMRcgDc1Ox2u9sFw9XhrrvuUnl5uQoLC3X//fdX69wAbh7cXQXgphYZGamtW7fq0KFD+vbbb82zMdfizjvvVL9+/TRgwAC99957OnjwoLZt26bJkycrKyurGroGcDMg5AC4qY0YMULe3t6Kjo5WgwYNlJ+fXy3zzp8/XwMGDNBvf/tbNW/eXD17
9tT27dvVuHHjapkfgOfxxGMAAGBJnMkBAACWRMgBAACWRMgBAACWRMgBAACWRMgBAACWRMgBAACWRMgBAACWRMgBAACWRMgBAACWRMgBAACWRMgBAACW9P+mY5AbK28KTwAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "s_rating.groupby('time').agg({'rating':'count'}).reset_index().plot(x='time',kind='bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([5., 4., 3., 2., 1.])"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_rating.rating.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    778957.000000\n",
       "mean          0.633102\n",
       "std           0.481959\n",
       "min           0.000000\n",
       "25%           0.000000\n",
       "50%           1.000000\n",
       "75%           1.000000\n",
       "max           1.000000\n",
       "Name: label, dtype: float64"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_rating['label'] = s_rating['rating'].apply(lambda x: 1 if x>=5 else 0)\n",
    "s_rating['label'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(22966, 34153)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users = s_rating.uid.unique()\n",
    "items = s_rating.iid.unique()\n",
    "users_map = dict(zip(users, np.arange(users.shape[0])+1))\n",
    "items_map = dict(zip(items, np.arange(items.shape[0])+1))\n",
    "# users_map[0]=0\n",
    "# items_map[0]=0\n",
    "s_rating['uid'] = s_rating['uid'].map(users_map)\n",
    "s_rating['iid'] = s_rating['iid'].map(items_map)\n",
    "s_rating.uid.max(), s_rating.iid.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((727463, 11), (25747, 11), (25747, 11))"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_train = s_rating[s_rating.time.isin(range(1,12))].copy()\n",
    "rating_valid_test = s_rating[s_rating.time.isin([12])].copy()\n",
    "rating_valid_test.sort_values(by=\"timestamp\",inplace=True)\n",
    "N_ = rating_valid_test.shape[0]//2\n",
    "# rating_p = rating_valid_test.iloc[0:N_].copy()\n",
    "# rating_train = pd.concat([rating_train, rating_p],axis=0)\n",
    "rating_valid = rating_valid_test.iloc[:N_].copy()\n",
    "rating_test = rating_valid_test.iloc[N_:].copy()\n",
    "rating_train.shape, rating_valid.shape, rating_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 2, 3, 4])"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_valid_test.timestamp.values[0:5].argsort()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1]), array([12]))"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_train.time.unique(),rating_valid.time.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "rating    12.623846\n",
       "dtype: float64"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_rating[s_rating.time.isin(range(7,13))].groupby('iid').agg({\"rating\":'count'}).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_user = rating_train['uid'].unique()\n",
    "train_item = rating_train['iid'].unique()\n",
    "rating_valid['not_cold'] = rating_valid[['uid','iid']].apply(lambda x: x.uid in train_user and x.iid in train_item, axis=1).astype(\"int\")\n",
    "rating_test['not_cold'] = rating_test[['uid','iid']].apply(lambda x: x.uid in train_user and x.iid in train_item, axis=1).astype(\"int\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(count    25747.000000\n",
       " mean         0.707539\n",
       " std          0.454902\n",
       " min          0.000000\n",
       " 25%          0.000000\n",
       " 50%          1.000000\n",
       " 75%          1.000000\n",
       " max          1.000000\n",
       " Name: not_cold, dtype: float64,\n",
       " count    25747.000000\n",
       " mean         0.677322\n",
       " std          0.467510\n",
       " min          0.000000\n",
       " 25%          0.000000\n",
       " 50%          1.000000\n",
       " 75%          1.000000\n",
       " max          1.000000\n",
       " Name: not_cold, dtype: float64)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_valid['not_cold'].describe(), rating_test['not_cold'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((727463, 11), (25747, 12), (25747, 12))"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_train.shape, rating_valid.shape, rating_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(              label              \n",
       "               count           sum\n",
       " count  22854.000000  22854.000000\n",
       " mean      31.830883     20.171042\n",
       " std       34.845966     25.668595\n",
       " min        1.000000      0.000000\n",
       " 25%       15.000000      7.000000\n",
       " 50%       22.000000     14.000000\n",
       " 75%       36.000000     24.000000\n",
       " max      865.000000    851.000000,\n",
       "               label              \n",
       "               count           sum\n",
       " count  33713.000000  33713.000000\n",
       " mean      21.578115     13.673924\n",
       " std       33.510363     23.416971\n",
       " min        1.000000      0.000000\n",
       " 25%        5.000000      3.000000\n",
       " 50%       12.000000      7.000000\n",
       " 75%       24.000000     14.000000\n",
       " max     1060.000000    464.000000)"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_train.groupby(\"uid\").agg({'label':['count','sum']}).describe(), rating_train.groupby(\"iid\").agg({'label':['count','sum']}).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(              label              \n",
       "               count           sum\n",
       " count  10207.000000  10207.000000\n",
       " mean       2.522485      1.565004\n",
       " std        2.684147      2.017672\n",
       " min        1.000000      0.000000\n",
       " 25%        1.000000      0.000000\n",
       " 50%        2.000000      1.000000\n",
       " 75%        3.000000      2.000000\n",
       " max       78.000000     36.000000,\n",
       "              label             \n",
       "              count          sum\n",
       " count  9997.000000  9997.000000\n",
       " mean      2.575473     1.597879\n",
       " std       8.157588     5.848883\n",
       " min       1.000000     0.000000\n",
       " 25%       1.000000     0.000000\n",
       " 50%       1.000000     1.000000\n",
       " 75%       2.000000     1.000000\n",
       " max     305.000000   202.000000)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rating_valid.groupby(\"uid\").agg({'label':['count','sum']}).describe()\n",
    "rating_test.groupby(\"uid\").agg({'label':['count','sum']}).describe(), rating_test.groupby(\"iid\").agg({'label':['count','sum']}).describe()\n",
    "# rating_valid_test.groupby(\"uid\").agg({'label':['count','sum']}).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_slot = list(range(26,36))\n",
    "# valid_slot = list(range(36,37))\n",
    "# test_slot = list(range(37,38))\n",
    "# rating_  = rating_all[rating_all['time'].isin(list(range(26,38)))].copy().reset_index()\n",
    "# train_slot = list(range(2010,2017))\n",
    "# valid_slot = list(range(34,36))\n",
    "# test_slot = list(range(36,38))\n",
    "# rating_  = rating_all[rating_all['time'].isin(list(range(24,38)))].copy().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rating_['rating'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rating_['label'] = rating_['rating'].apply(lambda x: 1 if x>4 else 0)\n",
    "# rating_.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rating_.label.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rating_.uid.min(), rating_.uid.max(), rating_.iid.min(),rating_.iid.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rating_.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rating_.shape[0]/(rating_.uid.max()*rating_.iid.max())"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## split by time order"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rating_train = rating_[rating_['time'].isin(train_slot)].copy()\n",
    "# rating_valid = rating_[rating_['time'].isin(valid_slot)].copy()\n",
    "# rating_test = rating_[rating_['time'].isin(test_slot)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((727463, 11), (25747, 12), (25747, 12))"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_train.shape, rating_valid.shape, rating_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "rating_valid_f = rating_valid\n",
    "rating_test_f = rating_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_cold_start(train,valid,test):\n",
    "    train_user = train.uid.unique()\n",
    "    train_item = train.iid.unique()\n",
    "    valid = valid[valid['uid'].isin(train_user)]\n",
    "    test = test[test['uid'].isin(train_user)]\n",
    "    valid = valid[valid['iid'].isin(train_item)]\n",
    "    test = test[test['iid'].isin(train_item)]\n",
    "    return valid, test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rating_valid_f, rating_test_f = filter_cold_start(rating_train,rating_valid,rating_test)\n",
    "# rating_valid_f.shape, rating_test_f.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.6336940847850681, 0.6290441604847167, 0.6204217967141803)"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_train.label.mean(), rating_valid.label.mean(), rating_test.label.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.6290441604847167, 0.6204217967141803)"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_valid_f.label.mean(), rating_test_f.label.mean() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['index', 'iid', 'uid', 'rating', 'timestamp', 'category', 'title',\n",
       "       'brand', 'price', 'time', 'label', 'not_cold'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_valid_f.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "import copy\n",
    "def deal_with_each_u(x,u):\n",
    "    items = np.array(x.iid)\n",
    "    labels = np.array(x.label)\n",
    "    titles = np.array(x.title)\n",
    "    timestamp = np.array(x.timestamp)\n",
    "    flags =  np.array(x.flag) \n",
    "    his = [0] # adding a '0' by default\n",
    "    his_title = ['']\n",
    "    results = []\n",
    "    for i in range(items.shape[0]):\n",
    "        results.append((u, items[i], timestamp[i], np.array(his), copy.copy(his_title),titles[i], labels[i], flags[i]))\n",
    "        # training data\n",
    "        if labels[i] > 0: # positive \n",
    "            his.append(items[i])\n",
    "            his_title.append(titles[i])\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "rating_train = rating_train.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "rating_train['flag'] =  pd.DataFrame(np.ones(rating_train.shape[0])*-1, index=rating_train.index)\n",
    "rating_valid_f['flag'] = pd.DataFrame(np.zeros(rating_valid_f.shape[0]), index=rating_valid_f.index)\n",
    "rating_test_f['flag'] = pd.DataFrame(np.ones(rating_test_f.shape[0]), index=rating_test_f.index)\n",
    "data_ = pd.concat([rating_train, rating_valid_f, rating_test_f],axis=0,ignore_index=True)\n",
    "data_ = data_.sort_values(by=['uid','timestamp'])\n",
    "u_inter_all = data_.groupby('uid').agg({'iid':list, 'label':list, 'title':list, 'timestamp':list,'flag':list})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-1.,  0.,  1.])"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_.flag.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = []\n",
    "for u in u_inter_all.index:\n",
    "    results.extend(deal_with_each_u(u_inter_all.loc[u],u))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "u_, i_, time_, label_, his_, his_title, title_,flag_ = [],[],[],[],[],[],[],[]\n",
    "for re_ in results:\n",
    "    u_.append(re_[0])\n",
    "    i_.append(re_[1])\n",
    "    time_.append(re_[2])\n",
    "    his_.append(re_[3])\n",
    "    his_title.append(re_[4])\n",
    "    title_.append(re_[5])\n",
    "    label_.append(re_[6])\n",
    "    flag_.append(re_[7])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_ = pd.DataFrame({\"uid\":u_,'iid':i_,'label':label_, 'timestamp': time_ , 'his':his_,'his_title':his_title,'title':title_, 'flag': flag_})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uid</th>\n",
       "      <th>iid</th>\n",
       "      <th>label</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>his</th>\n",
       "      <th>his_title</th>\n",
       "      <th>title</th>\n",
       "      <th>flag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>3054</td>\n",
       "      <td>1</td>\n",
       "      <td>1.483920e+09</td>\n",
       "      <td>[0]</td>\n",
       "      <td>[]</td>\n",
       "      <td>ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE by L...</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1985</td>\n",
       "      <td>0</td>\n",
       "      <td>1.484525e+09</td>\n",
       "      <td>[0, 3054]</td>\n",
       "      <td>[, ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE b...</td>\n",
       "      <td>Norwegian Wood</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   uid   iid  label     timestamp        his  \\\n",
       "0    1  3054      1  1.483920e+09        [0]   \n",
       "1    1  1985      0  1.484525e+09  [0, 3054]   \n",
       "\n",
       "                                           his_title  \\\n",
       "0                                                 []   \n",
       "1  [, ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE b...   \n",
       "\n",
       "                                               title  flag  \n",
       "0  ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE by L...  -1.0  \n",
       "1                                     Norwegian Wood  -1.0  "
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    778957.000000\n",
       "mean          0.633102\n",
       "std           0.481959\n",
       "min           0.000000\n",
       "25%           0.000000\n",
       "50%           1.000000\n",
       "75%           1.000000\n",
       "max           1.000000\n",
       "Name: label, dtype: float64"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_.label.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uid</th>\n",
       "      <th>iid</th>\n",
       "      <th>label</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>his</th>\n",
       "      <th>his_title</th>\n",
       "      <th>title</th>\n",
       "      <th>flag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>778952</th>\n",
       "      <td>22964</td>\n",
       "      <td>33545</td>\n",
       "      <td>1</td>\n",
       "      <td>1.510790e+09</td>\n",
       "      <td>[0, 33545, 33545, 33545, 33545, 33545, 33545, ...</td>\n",
       "      <td>[, Garth Brooks The Anthology: The First Five ...</td>\n",
       "      <td>Garth Brooks The Anthology: The First Five Years</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>778953</th>\n",
       "      <td>22964</td>\n",
       "      <td>33545</td>\n",
       "      <td>1</td>\n",
       "      <td>1.510790e+09</td>\n",
       "      <td>[0, 33545, 33545, 33545, 33545, 33545, 33545, ...</td>\n",
       "      <td>[, Garth Brooks The Anthology: The First Five ...</td>\n",
       "      <td>Garth Brooks The Anthology: The First Five Years</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>778954</th>\n",
       "      <td>22964</td>\n",
       "      <td>33545</td>\n",
       "      <td>1</td>\n",
       "      <td>1.510790e+09</td>\n",
       "      <td>[0, 33545, 33545, 33545, 33545, 33545, 33545, ...</td>\n",
       "      <td>[, Garth Brooks The Anthology: The First Five ...</td>\n",
       "      <td>Garth Brooks The Anthology: The First Five Years</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>778955</th>\n",
       "      <td>22965</td>\n",
       "      <td>33724</td>\n",
       "      <td>1</td>\n",
       "      <td>1.502237e+09</td>\n",
       "      <td>[0]</td>\n",
       "      <td>[]</td>\n",
       "      <td>The Day Of The Jackal</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>778956</th>\n",
       "      <td>22966</td>\n",
       "      <td>33875</td>\n",
       "      <td>1</td>\n",
       "      <td>1.508803e+09</td>\n",
       "      <td>[0]</td>\n",
       "      <td>[]</td>\n",
       "      <td>The Angel Experiment: A Maximum Ride Novel</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          uid    iid  label     timestamp  \\\n",
       "778952  22964  33545      1  1.510790e+09   \n",
       "778953  22964  33545      1  1.510790e+09   \n",
       "778954  22964  33545      1  1.510790e+09   \n",
       "778955  22965  33724      1  1.502237e+09   \n",
       "778956  22966  33875      1  1.508803e+09   \n",
       "\n",
       "                                                      his  \\\n",
       "778952  [0, 33545, 33545, 33545, 33545, 33545, 33545, ...   \n",
       "778953  [0, 33545, 33545, 33545, 33545, 33545, 33545, ...   \n",
       "778954  [0, 33545, 33545, 33545, 33545, 33545, 33545, ...   \n",
       "778955                                                [0]   \n",
       "778956                                                [0]   \n",
       "\n",
       "                                                his_title  \\\n",
       "778952  [, Garth Brooks The Anthology: The First Five ...   \n",
       "778953  [, Garth Brooks The Anthology: The First Five ...   \n",
       "778954  [, Garth Brooks The Anthology: The First Five ...   \n",
       "778955                                                 []   \n",
       "778956                                                 []   \n",
       "\n",
       "                                                   title  flag  \n",
       "778952  Garth Brooks The Anthology: The First Five Years  -1.0  \n",
       "778953  Garth Brooks The Anthology: The First Five Years  -1.0  \n",
       "778954  Garth Brooks The Anthology: The First Five Years  -1.0  \n",
       "778955                             The Day Of The Jackal  -1.0  \n",
       "778956        The Angel Experiment: A Maximum Ride Novel  -1.0  "
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_.tail(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((727463, 8), (25747, 8), (25747, 8))"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_ = data_[data_['flag'].isin([-1])].copy()\n",
    "valid_ = data_[data_['flag'].isin([0])].copy()\n",
    "test_ = data_[data_['flag'].isin([1])].copy()\n",
    "train_.shape,valid_.shape,test_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_user = train_['uid'].unique()\n",
    "train_item = train_['iid'].unique()\n",
    "valid_['not_cold'] = valid_[['uid','iid']].apply(lambda x: x.uid in train_user and x.iid in train_item, axis=1).astype(\"int\")\n",
    "test_['not_cold'] = test_[['uid','iid']].apply(lambda x: x.uid in train_user and x.iid in train_item, axis=1).astype(\"int\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(count    25747.000000\n",
       " mean         0.707539\n",
       " std          0.454902\n",
       " min          0.000000\n",
       " 25%          0.000000\n",
       " 50%          1.000000\n",
       " 75%          1.000000\n",
       " max          1.000000\n",
       " Name: not_cold, dtype: float64,\n",
       " count    25747.000000\n",
       " mean         0.677322\n",
       " std          0.467510\n",
       " min          0.000000\n",
       " 25%          0.000000\n",
       " 50%          1.000000\n",
       " 75%          1.000000\n",
       " max          1.000000\n",
       " Name: not_cold, dtype: float64)"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_['not_cold'].describe(), test_['not_cold'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uid</th>\n",
       "      <th>iid</th>\n",
       "      <th>label</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>his</th>\n",
       "      <th>his_title</th>\n",
       "      <th>title</th>\n",
       "      <th>flag</th>\n",
       "      <th>not_cold</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>3054</td>\n",
       "      <td>1</td>\n",
       "      <td>1.483920e+09</td>\n",
       "      <td>[0]</td>\n",
       "      <td>[]</td>\n",
       "      <td>ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE by L...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1985</td>\n",
       "      <td>0</td>\n",
       "      <td>1.484525e+09</td>\n",
       "      <td>[0, 3054]</td>\n",
       "      <td>[, ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE b...</td>\n",
       "      <td>Norwegian Wood</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   uid   iid  label     timestamp        his  \\\n",
       "0    1  3054      1  1.483920e+09        [0]   \n",
       "1    1  1985      0  1.484525e+09  [0, 3054]   \n",
       "\n",
       "                                           his_title  \\\n",
       "0                                                 []   \n",
       "1  [, ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE b...   \n",
       "\n",
       "                                               title  flag  not_cold  \n",
       "0  ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE by L...  -1.0         1  \n",
       "1                                     Norwegian Wood  -1.0         1  "
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_['not_cold'] = pd.DataFrame(np.ones(train_.shape[0]),index=train_.index).astype(\"int\")\n",
    "train_.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_path = \"/data/zyang/datasets/amazon_book_new/\"\n",
    "train_.to_pickle(save_path+\"train_ood2.pkl\")\n",
    "valid_.to_pickle(save_path+\"valid_ood2.pkl\")\n",
    "test_.to_pickle(save_path+\"test_ood2.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uid</th>\n",
       "      <th>iid</th>\n",
       "      <th>label</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>his</th>\n",
       "      <th>his_title</th>\n",
       "      <th>title</th>\n",
       "      <th>flag</th>\n",
       "      <th>not_cold</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>3054</td>\n",
       "      <td>1</td>\n",
       "      <td>1.483920e+09</td>\n",
       "      <td>[0]</td>\n",
       "      <td>[]</td>\n",
       "      <td>ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE by L...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1985</td>\n",
       "      <td>0</td>\n",
       "      <td>1.484525e+09</td>\n",
       "      <td>[0, 3054]</td>\n",
       "      <td>[, ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE b...</td>\n",
       "      <td>Norwegian Wood</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   uid   iid  label     timestamp        his  \\\n",
       "0    1  3054      1  1.483920e+09        [0]   \n",
       "1    1  1985      0  1.484525e+09  [0, 3054]   \n",
       "\n",
       "                                           his_title  \\\n",
       "0                                                 []   \n",
       "1  [, ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE b...   \n",
       "\n",
       "                                               title  flag  not_cold  \n",
       "0  ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE by L...  -1.0         1  \n",
       "1                                     Norwegian Wood  -1.0         1  "
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"ENDURANCE: SHACKLETON'S INCREDIBLE VOYAGE by Lansing, Alfred ( Author ) on Apr-24-1999[ Paperback ]\""
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Full title string of the first training row (the table view above truncates it).\n",
    "train_['title'].iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25747, 9)"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw a 25% random subsample of the validation split with a fixed seed (2023)\n",
    "# and persist it next to the full splits.\n",
    "valid_small = valid_.sample(\n",
    "    frac=0.25,\n",
    "    random_state=2023,\n",
    ")\n",
    "valid_small.to_pickle(save_path + \"valid_small_ood2.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6437, 9)"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_small.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "# valid_small.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    727463.000000\n",
       "mean         22.816884\n",
       "std          39.927104\n",
       "min           1.000000\n",
       "25%           5.000000\n",
       "50%          11.000000\n",
       "75%          25.000000\n",
       "max         851.000000\n",
       "Name: his, dtype: float64"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics of the length of each training row's `his` list.\n",
    "train_['his'].map(len).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.6336940847850681, 0.6204217967141803, 0.6290441604847167)"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean of `label` in the train, test and validation splits (in that order).\n",
    "tuple(split['label'].mean() for split in (train_, test_, valid_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>sum</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>10207.000000</td>\n",
       "      <td>10207.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>2.522485</td>\n",
       "      <td>1.565004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.684147</td>\n",
       "      <td>2.017672</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>78.000000</td>\n",
       "      <td>36.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              label              \n",
       "              count           sum\n",
       "count  10207.000000  10207.000000\n",
       "mean       2.522485      1.565004\n",
       "std        2.684147      2.017672\n",
       "min        1.000000      0.000000\n",
       "25%        1.000000      0.000000\n",
       "50%        2.000000      1.000000\n",
       "75%        3.000000      2.000000\n",
       "max       78.000000     36.000000"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-user row count and label sum in the test split, summarised across users.\n",
    "(\n",
    "    test_\n",
    "    .groupby(\"uid\")\n",
    "    .agg({\"label\": ['count', 'sum']})\n",
    "    .describe()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>sum</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>33713.000000</td>\n",
       "      <td>33713.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>21.578115</td>\n",
       "      <td>13.673924</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>33.510363</td>\n",
       "      <td>23.416971</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>5.000000</td>\n",
       "      <td>3.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>12.000000</td>\n",
       "      <td>7.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>24.000000</td>\n",
       "      <td>14.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1060.000000</td>\n",
       "      <td>464.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              label              \n",
       "              count           sum\n",
       "count  33713.000000  33713.000000\n",
       "mean      21.578115     13.673924\n",
       "std       33.510363     23.416971\n",
       "min        1.000000      0.000000\n",
       "25%        5.000000      3.000000\n",
       "50%       12.000000      7.000000\n",
       "75%       24.000000     14.000000\n",
       "max     1060.000000    464.000000"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-item row count and label sum in the training split, summarised across items.\n",
    "(\n",
    "    train_\n",
    "    .groupby(\"iid\")\n",
    "    .agg({\"label\": ['count', 'sum']})\n",
    "    .describe()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10217,)"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct validation users that also occur in the training split.\n",
    "np.intersect1d(valid_['uid'].unique(), train_['uid'].unique()).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 3054,  1985, 24657, ..., 33983, 27284, 27395])"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distinct item ids of the training split, in order of first appearance.\n",
    "pd.unique(train_['iid'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8464,)"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct validation items that also occur in the training split.\n",
    "np.intersect1d(valid_['iid'].unique(), train_['iid'].unique()).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((8749,), (9997,))"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct items in the validation and test splits (in that order).\n",
    "tuple(split['iid'].unique().shape for split in (valid_, test_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((10270,), (10207,))"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct users in the validation and test splits (in that order).\n",
    "tuple(split['uid'].unique().shape for split in (valid_, test_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((22854,), (33713,))"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct users and distinct items in the training split.\n",
    "tuple(train_[col].unique().shape for col in ('uid', 'iid'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((66,), (53,))"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Users that appear in test / validation but never in training\n",
    "# (test first, then validation).\n",
    "tuple(\n",
    "    np.setdiff1d(split['uid'].unique(), train_['uid'].unique()).shape\n",
    "    for split in (test_, valid_)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((392,), (285,))"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Items that appear in test / validation but never in training\n",
    "# (test first, then validation).\n",
    "tuple(\n",
    "    np.setdiff1d(split['iid'].unique(), train_['iid'].unique()).shape\n",
    "    for split in (test_, valid_)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "# m = train_.groupby('uid').agg({'label':'count'}).sort_values('label').reset_index()\n",
    "# m.plot(x='uid',kind='bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_.uid.max()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save_path = \"/data/zyang/datasets/ml-1m/\"\n",
    "# train_ = pd.read_pickle(save_path+\"train_ood2.pkl\")\n",
    "# valid_ = pd.read_pickle(save_path+\"valid_ood2.pkl\")\n",
    "# test_ = pd.read_pickle(save_path+\"test_ood2.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(22967, 34154)"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Largest user id and largest item id over all three splits, each plus one.\n",
    "# NOTE(review): presumably the sizes of the user / item id spaces - confirm with the consumer.\n",
    "tuple(\n",
    "    max(split[col].max() for split in (train_, test_, valid_)) + 1\n",
    "    for col in ('uid', 'iid')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "# valid_small = pd.read_pickle(save_path+\"valid_small_ood2.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "# valid_small.uid.max(), valid_small.iid.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save_path = \"/data/zyang/datasets/amazon/\"\n",
    "# train = pd.read_pickle(save_path+\"train_ood2.pkl\")\n",
    "# valid = pd.read_pickle(save_path+\"valid_ood2.pkl\")\n",
    "# test = pd.read_pickle(save_path+\"test_ood2.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>10270.000000</td>\n",
       "      <td>10270.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>2.507011</td>\n",
       "      <td>0.630202</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.584496</td>\n",
       "      <td>0.422755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.937255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>45.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              label              \n",
       "              count          mean\n",
       "count  10270.000000  10270.000000\n",
       "mean       2.507011      0.630202\n",
       "std        2.584496      0.422755\n",
       "min        1.000000      0.000000\n",
       "25%        1.000000      0.000000\n",
       "50%        2.000000      0.937255\n",
       "75%        3.000000      1.000000\n",
       "max       45.000000      1.000000"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-user row count and mean label in the validation split, summarised across users.\n",
    "(\n",
    "    valid_\n",
    "    .groupby(\"uid\")\n",
    "    .agg({\"label\": ['count', 'mean']})\n",
    "    .describe()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>10207.000000</td>\n",
       "      <td>10207.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>2.522485</td>\n",
       "      <td>0.631107</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.684147</td>\n",
       "      <td>0.424097</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>78.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              label              \n",
       "              count          mean\n",
       "count  10207.000000  10207.000000\n",
       "mean       2.522485      0.631107\n",
       "std        2.684147      0.424097\n",
       "min        1.000000      0.000000\n",
       "25%        1.000000      0.000000\n",
       "50%        2.000000      1.000000\n",
       "75%        3.000000      1.000000\n",
       "max       78.000000      1.000000"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-user row count and mean label in the test split, summarised across users.\n",
    "(\n",
    "    test_\n",
    "    .groupby(\"uid\")\n",
    "    .agg({\"label\": ['count', 'mean']})\n",
    "    .describe()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reload the saved splits and summarise per-user statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "save_path = \"/data/zyang/datasets/amazon_book_new/\"\n",
    "# Load the three pickled splits from `save_path`.\n",
    "train_, valid_, test_ = (\n",
    "    pd.read_pickle(save_path + file_name)\n",
    "    for file_name in (\"train_ood2.pkl\", \"valid_ood2.pkl\", \"test_ood2.pkl\")\n",
    ")\n",
    "# Stack the splits row-wise into a single frame; each split keeps its own row labels.\n",
    "data = pd.concat([train_, valid_, test_])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>sum</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>22966.000000</td>\n",
       "      <td>22966.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>21.473439</td>\n",
       "      <td>33.917835</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>27.302854</td>\n",
       "      <td>37.038077</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>7.000000</td>\n",
       "      <td>16.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>15.000000</td>\n",
       "      <td>23.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>26.000000</td>\n",
       "      <td>39.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>905.000000</td>\n",
       "      <td>919.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              label              \n",
       "                sum         count\n",
       "count  22966.000000  22966.000000\n",
       "mean      21.473439     33.917835\n",
       "std       27.302854     37.038077\n",
       "min        0.000000      1.000000\n",
       "25%        7.000000     16.000000\n",
       "50%       15.000000     23.000000\n",
       "75%       26.000000     39.000000\n",
       "max      905.000000    919.000000"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-user label sum and row count over the concatenated splits, summarised across users.\n",
    "(\n",
    "    data\n",
    "    .groupby(\"uid\")\n",
    "    .agg({\"label\": ['sum', 'count']})\n",
    "    .describe()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "minigpt4",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
